### import section
# Import libraries with reasons
# Need to use numpy & pandas dataframe
import numpy as np
import pandas as pd
from pandas import DataFrame
### Import matplotlib:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import matplotlib.patches as mpatches # needed for waffle Charts
import folium
import seaborn as sns
from PIL import Image # converting images into arrays
from wordcloud import WordCloud
mpl.style.use('ggplot')
# library to handle requests
import requests
# Regular expressions (e.g. for multi-pattern string replacement)
import re
# module to convert an address into latitude and longitude values
import geocoder
from geopy.geocoders import Nominatim
# libraries for displaying images
from IPython.display import Image
from IPython.core.display import HTML
from pandas.io.json import json_normalize
from datetime import datetime
# Build the World master DataFrame with date-wise Covid19 counts.
df_world_master = pd.read_csv('https://covid.ourworldindata.org/data/ecdc/full_data.csv')
print ("Master DataFrame got created with all countries")

### Work on a copy so the raw download stays untouched.
df_world_master2 = df_world_master.copy()

### Drop the weekly/biweekly aggregates - only the daily and total columns are used below.
df_world_master2.drop(['weekly_cases', 'weekly_deaths', 'biweekly_cases', 'biweekly_deaths'],
                      axis=1, inplace=True)

### Convert to datetime so the .dt accessor can be used for date filtering.
df_world_master2['date'] = pd.to_datetime(df_world_master2.date)

### Thin the daily data down to a fortnightly series for reporting: keep only
### the 1st and the 16th of each month. (A single boolean mask replaces the
### original double .iloc/.index round-trip combined with DataFrame.append,
### which was removed in pandas 2.0; the subsequent sort makes the row order
### identical to the old version.)
### NOTE(review): the original comment said "1st and 15th" but the code kept
### day 16 - behaviour (day 16) is preserved here.
df_world_master2 = df_world_master2[df_world_master2['date'].dt.day.isin([1, 16])]
df_world_master2.sort_values(['location', 'date'], ascending=[True, True], inplace=True)

### Extract the consolidated 'World' rows (the master holds every country plus
### a 'World' roll-up location).
df_on_world = df_world_master2.loc[
    df_world_master2['location'] == 'World',
    ['date', 'new_cases', 'new_deaths', 'total_cases', 'total_deaths']
].copy()
df_on_world.head()
### Two stacked panels: daily new counts on top, cumulative totals below.
figure(num=None, figsize=(20, 8), dpi=80, facecolor='w', edgecolor='k')

panel_specs = [
    (1, 'new_cases', 'new_deaths', 'World Covid19 2020 - New Counts', 'New'),
    (2, 'total_cases', 'total_deaths', 'World Covid19 2020 - Total Counts', 'Total'),
]
for position, case_col, death_col, panel_title, y_label in panel_specs:
    plt.subplot(2, 1, position)
    plt.plot(df_on_world['date'], df_on_world[case_col], label="Case")
    plt.plot(df_on_world['date'], df_on_world[death_col], label="Death")
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.legend()

# The x label only belongs on the bottom panel (the current axes after the loop).
plt.xlabel('Months')

# Render the figure.
plt.show()
### Restrict to the reporting columns (same columns as before - kept for safety).
df_on_world = df_on_world[['date', 'new_cases', 'new_deaths', 'total_cases', 'total_deaths']]

### Express the cumulative counts in thousands ('K') for readability.
for thousands_col in ('total_cases', 'total_deaths'):
    df_on_world[thousands_col] = df_on_world[thousands_col] / 1000

plt.figure(figsize=(15, 10))
sns.set(font_scale=1.5)
sns.set_style('whitegrid')

### Scatter with a fitted regression line: total deaths against total positives.
ax = sns.regplot(x='total_cases', y='total_deaths', data=df_on_world,
                 color='green', marker='+', scatter_kws={'s': 200})
ax.set(xlabel='Total Positive', ylabel='Total Death')
ax.set_title('World Covid19: Comparison between Positive & Deaths (In Thousand)')
### Extract the countries that appear in the top-6 by total cases OR the top-6
### by total deaths on the latest date. (Series.append / DataFrame.append were
### removed in pandas 2.0 - both unions now use pd.concat.)
top_by_cases = df_world_master2.sort_values(
    ['date', 'total_cases'], ascending=[True, True]).tail(6)['location']
top_by_deaths = df_world_master2.sort_values(
    ['date', 'total_deaths'], ascending=[True, True]).tail(6)['location']
top_locations = pd.concat([top_by_cases, top_by_deaths], ignore_index=True).unique()

### One frame per location, concatenated in top-list order (preserving the
### label ordering that the bar charts below rely on).
df_world_top = pd.concat(
    [df_world_master2[df_world_master2['location'] == affiliate]
     for affiliate in top_locations]
)

### 'World' is a roll-up row, not a country - drop it from the comparison.
df_world_top.drop(df_world_top[df_world_top['location'] == 'World'].index, inplace=True)
### Counts only started growing from Mar-2020; discard the earlier rows.
df_world_top.drop(df_world_top[df_world_top['date'] < '2020-03-01'].index, inplace=True)
### Horizontal bar chart: total positives per top country (as on 01-Sep-2020).
country_names = df_world_top['location'].unique()
bar_positions = np.arange(len(country_names))
cases_snapshot = df_world_top[df_world_top['date'] == '2020-09-01']['total_cases']

plt.barh(bar_positions, cases_snapshot, align='center', alpha=0.5)
plt.yticks(bar_positions, country_names)
plt.xlabel('Total Positive Cases')
plt.title('COVID19 - Highest Positive Countries')
plt.show()
plt.rcParams["figure.figsize"] = 20, 10
labels = df_world_top['location'].unique()
## The data is cumulative, so take the latest snapshot date only.
total_cases = df_world_top[df_world_top['date'] == '2020-09-01']['total_cases']
total_deaths = df_world_top[df_world_top['date'] == '2020-09-01']['total_deaths']

x = np.arange(len(labels))  # the label locations
width = 0.35                # the width of the bars

# Grouped bars: identified cases and deaths side by side per country.
fig, ax = plt.subplots()
rects1 = ax.bar(x - width / 2, total_cases, width, label='Identified')
rects2 = ax.bar(x + width / 2, total_deaths, width, label='Death')
ax.set_ylabel('Count')
# Bug fix: corrected the title typo 'Countes' -> 'Counts'.
ax.set_title('Counts by Total Identified and Death (as on 01-Sep-2020)')
ax.set_xticks(x)
ax.set_xticklabels(labels)
ax.legend()

def autolabel(rects):
    """Attach a text label above each bar in *rects*, displaying its height.

    Uses the ``ax`` axes object from the enclosing scope.
    """
    for rect in rects:
        height = int(rect.get_height())
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

autolabel(rects1)
autolabel(rects2)
fig.tight_layout()
plt.show()
### Compare the fortnightly trends for UK, Russia, India, Brazil & USA.
figure(num=None, figsize=(20, 8), dpi=80, facecolor='w', edgecolor='k')

# Same plotting order as before, so line colours and legend order are unchanged.
comparison_countries = ['United Kingdom', 'Russia', 'India', 'Brazil', 'United States']

# Top panel: new cases per fortnight.
plt.subplot(2, 1, 1)
for country in comparison_countries:
    country_rows = df_world_top[df_world_top['location'] == country]
    plt.plot(country_rows['date'], country_rows['new_cases'], label=country)
plt.title('COVID19 - New Counts')
plt.ylabel('Total Positive')
plt.legend()

# Bottom panel: new deaths per fortnight.
plt.subplot(2, 1, 2)
for country in comparison_countries:
    country_rows = df_world_top[df_world_top['location'] == country]
    plt.plot(country_rows['date'], country_rows['new_deaths'], label=country)
plt.xlabel('Fortnight Dates')
plt.ylabel('Total Death')
plt.legend()

# Render the figure.
plt.show()
### Remove the 'World' roll-up row so only real countries are mapped.
df_world_master2.drop(df_world_master2[df_world_master2['location'] == 'World'].index,
                      inplace=True)

world_geo = r'world_countries.json'

# Six linearly spaced bin edges spanning the min..max total case count on the
# snapshot date (01-Sep-2020); the snapshot is computed once and reused.
snapshot = df_world_master2[df_world_master2['date'] == '2020-09-01']
threshold_scale = np.linspace(snapshot['total_cases'].min(),
                              snapshot['total_cases'].max(),
                              6, dtype=int).tolist()
# Make the last edge strictly greater than the maximum so every country falls in a bin.
threshold_scale[-1] = threshold_scale[-1] + 1

# NOTE(review): 'Mapbox Bright' tiles require an API key in recent folium
# releases - switch to the default 'OpenStreetMap' tiles if the map renders blank.
world_map = folium.Map(location=[0, 0], zoom_start=2, tiles='Mapbox Bright')

# Map.choropleth() was deprecated and then removed in folium 1.0; use the
# folium.Choropleth class added to the map instead. The old 'reset' keyword no
# longer exists and has been dropped.
folium.Choropleth(
    geo_data=world_geo,
    data=snapshot[['location', 'total_cases']],
    columns=['location', 'total_cases'],
    key_on='feature.properties.name',
    threshold_scale=threshold_scale,
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Covid19 World: Total Positive cases as on 01-Sep-2020',
).add_to(world_map)
world_map
# Load the day-wise USA state tracking data.
df_usa_state_master = pd.read_csv('https://covidtracking.com/api/v1/states/daily.csv')
print ("USA Master DataFrame got created")

### Snapshot of the state-wise status on 01-Sep-2020.
snapshot_cols = ['date', 'state', 'positive', 'negative', 'death']
df_usa_state_master2 = df_usa_state_master[df_usa_state_master['date'] == 20200901][snapshot_cols]
df_usa_state_master2.rename({'state': 'code'}, axis=1, inplace=True)
df_usa_state_master2.head()

### The tracker only carries two-letter state codes; scrape the full state
### names so they can be matched against the geojson file later.
us_state_name = pd.read_html('https://worldpopulationreview.com/states/state-abbreviations')
df_us_state_name = DataFrame(us_state_name[0])
### Rename to match the snapshot's join column and the geojson property.
df_us_state_name.rename({'State': 'state_name', 'Code': 'code'}, axis=1, inplace=True)
### The abbreviation column is not needed.
df_us_state_name.drop(['Abbreviation'], axis=1, inplace=True)
df_us_state_name.head()

### Attach the full state name to each snapshot row via the shared code column.
df_usa_state_master2 = pd.merge(df_usa_state_master2, df_us_state_name, on='code', how='inner')
df_usa_state_master2.head()
usa_geo = r'usa_states.json'

# Six linearly spaced bin edges spanning the state positive-count range.
threshold_scale = np.linspace(df_usa_state_master2['positive'].min(),
                              df_usa_state_master2['positive'].max(),
                              6, dtype=int).tolist()
# Make the last edge strictly greater than the maximum so every state falls in a bin.
threshold_scale[-1] = threshold_scale[-1] + 1

# NOTE(review): 'Mapbox Bright' tiles require an API key in recent folium
# releases - switch to the default 'OpenStreetMap' tiles if the map renders blank.
world_map = folium.Map(location=[40, -100], zoom_start=4, tiles='Mapbox Bright')

# Map.choropleth() was deprecated and then removed in folium 1.0; use the
# folium.Choropleth class added to the map instead ('reset' kwarg dropped).
folium.Choropleth(
    geo_data=usa_geo,
    data=df_usa_state_master2,
    columns=['state_name', 'positive'],
    key_on='feature.properties.NAME',
    threshold_scale=threshold_scale,
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='USA Covid19',
).add_to(world_map)
world_map

### Keep only the five states with the highest death counts, indexed by state
### name for the waffle chart below.
df_usa_state_master2 = (df_usa_state_master2[df_usa_state_master2['date'] == 20200901]
                        .sort_values(['death'], ascending=[False])
                        .head(5))
df_usa_state_master2.set_index('state_name', inplace=True)
df_usa_state_master2.head()
def create_waffle_chart(categories, values, height, width, colormap, value_sign=''):
    """Render a waffle chart comparing *values* across *categories*.

    Each category gets a number of tiles proportional to its share of
    ``sum(values)`` on a ``height`` x ``width`` grid, coloured via *colormap*,
    with a matching colour-coded legend.

    Parameters
    ----------
    categories : sequence of str - labels, one per value.
    values : sequence of numbers - size of each category.
    height, width : int - grid dimensions in tiles.
    colormap : matplotlib colormap - used for both tiles and legend patches.
    value_sign : str - optional suffix/prefix shown in legend labels ('%' is
        rendered as a suffix, anything else as a prefix).
    """
    # Proportion of each category relative to the grand total.
    total_values = sum(values)
    category_proportions = [float(value) / total_values for value in values]

    total_num_tiles = width * height  # total number of tiles on the grid
    print ('Comparison between highest 5 death numbers\nTotal number of tiles is', total_num_tiles)

    # Tiles allocated to each category (rounded to whole tiles).
    tiles_per_category = [round(proportion * total_num_tiles)
                          for proportion in category_proportions]
    # Bug fix: report against the *categories* argument instead of reaching
    # into the global df_usa_state_master2 DataFrame, so the function works
    # with any caller-supplied labels.
    for i, tiles in enumerate(tiles_per_category):
        print (str(categories[i]) + ': ' + str(tiles))

    # Fill the grid column by column; each tile stores its category index.
    waffle_chart = np.zeros((height, width))
    category_index = 0
    tile_index = 0
    for col in range(width):
        for row in range(height):
            tile_index += 1
            # Once the current category's tile budget is exhausted,
            # move on to the next category.
            if tile_index > sum(tiles_per_category[0:category_index]):
                category_index += 1
            waffle_chart[row, col] = category_index

    # Instantiate a new figure object and draw the tile matrix.
    fig = plt.figure()
    # Bug fix: honour the caller-supplied colormap. The original reassigned
    # colormap = plt.cm.coolwarm here, silently ignoring the parameter.
    plt.matshow(waffle_chart, cmap=colormap)
    plt.colorbar()

    # White gridlines on the minor ticks visually separate the tiles.
    ax = plt.gca()
    ax.set_xticks(np.arange(-.5, width, 1), minor=True)
    ax.set_yticks(np.arange(-.5, height, 1), minor=True)
    ax.grid(which='minor', color='w', linestyle='-', linewidth=2)
    plt.xticks([])
    plt.yticks([])

    # Cumulative sums align each legend colour with its chart colour.
    values_cumsum = np.cumsum(values)
    total_values = values_cumsum[len(values_cumsum) - 1]

    # Build one legend patch per category.
    legend_handles = []
    for i, category in enumerate(categories):
        if value_sign == '%':
            label_str = category + ' (' + str(values[i]) + value_sign + ')'
        else:
            label_str = category + ' (' + value_sign + str(values[i]) + ')'
        color_val = colormap(float(values_cumsum[i]) / total_values)
        legend_handles.append(mpatches.Patch(color=color_val, label=label_str))

    # Attach the legend below the chart.
    plt.legend(
        handles=legend_handles,
        loc='lower center',
        ncol=len(categories),
        bbox_to_anchor=(0., -0.2, 0.95, .1)
    )
### Waffle-chart inputs: the five highest-death states and their death counts.
chart_width = 40                                     # grid width in tiles
chart_height = 10                                    # grid height in tiles
state_categories = df_usa_state_master2.index.values # state names (categories)
state_values = df_usa_state_master2['death']         # corresponding death counts
chart_colormap = plt.cm.coolwarm                     # colour map class

### Draw the waffle chart.
create_waffle_chart(state_categories, state_values, chart_height, chart_width, chart_colormap)
# Load the NYT county-level US Covid19 data.
df_usa_state_county_master = pd.read_csv(
    'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')

### Which state has the highest positive count?
df_usa_state_master2.sort_values(['positive'], ascending=[False])
df_usa_state_county_master[df_usa_state_county_master['state'] == 'California']['county'].unique()

# Build a word cloud from the Californian county names.
list_of_county = (df_usa_state_county_master[df_usa_state_county_master['state'] == 'California']
                  ['county'].to_string())
usa_ca = WordCloud(background_color='white', max_words=10000)
usa_ca.generate(list_of_county)

# Render the word cloud on a large canvas with the axes hidden.
fig = plt.figure()
fig.set_figwidth(24)
fig.set_figheight(18)
plt.imshow(usa_ca, interpolation='bilinear')
plt.axis('off')
plt.show()
### California county rows for the 01-Sep-2020 snapshot.
df_usa_ca_county = pd.DataFrame(
    df_usa_state_county_master[df_usa_state_county_master['state'] == 'California'])
df_usa_ca_county = df_usa_ca_county[df_usa_ca_county['date'] == '2020-09-01']

# Bug fix: the original assigned pd.DataFrame([[0, 0]]), which aligns on the
# row index - only a row labelled 0 would receive values and every other row
# got NaN. A scalar assignment initialises both columns to 0.0 for all rows.
df_usa_ca_county[['lat', 'lon']] = 0.0

geolocator = Nominatim(user_agent="foursquare_agent")

# Geocode each county; leave the (0, 0) sentinel when Nominatim finds no match.
# .loc replaces the chained assignment (df['lat'][ind1] = ...), which raises
# SettingWithCopyWarning and can silently write to a temporary copy.
for ind1 in df_usa_ca_county.index:
    address = df_usa_ca_county['county'][ind1] + ', California'
    location = geolocator.geocode(address)
    if location is None:  # idiomatic None check instead of isinstance(..., type(None))
        df_usa_ca_county.loc[ind1, ['lat', 'lon']] = 0
    else:
        df_usa_ca_county.loc[ind1, 'lat'] = location.latitude
        df_usa_ca_county.loc[ind1, 'lon'] = location.longitude
print ('Updated the latitude & longitude for California states')
df_usa_ca_county.head()
def CustomMap(Map, latitudes, longitudes, County, Stat):
    """Add one labelled circle marker per (lat, lon, county, stat) tuple to *Map*.

    The popup shows "<county>: <stat>" for each point; all markers share the
    same yellow outline / blue fill styling.
    """
    for lat, lng, county, stat in zip(latitudes, longitudes, County, Stat):
        popup = folium.Popup('{}: {}'.format(county, stat), parse_html=True)
        marker = folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=popup,
            color='yellow',
            fill=True,
            fill_color='#3186cc',
            fill_opacity=0.7,
            parse_html=False)
        marker.add_to(Map)
### Now let's check the counties for California.
### The map popup shows the county-wise Positive (+) and Death (-) cases.
Map = folium.Map(location=[36.7783, -119.4179], zoom_start=6)
CustomMap (Map, df_usa_ca_county['lat'], df_usa_ca_county['lon'],df_usa_ca_county['county'], '(+) ' + df_usa_ca_county['cases'].astype(str) + ', (-)' + df_usa_ca_county['deaths'].astype(str))
Map
### Creating a DataFrame for the 20 counties with the highest case counts.
df_usa_ca_county_top20 = df_usa_ca_county.sort_values(['cases'], ascending = False).head(20)
df_usa_ca_county_top20
# Individual Foursquare ID & Secret
# SECURITY NOTE(review): API credentials are hard-coded in the source. They
# should be moved to environment variables / a config file and these keys
# rotated, since anyone with this file can use them.
CLIENT_ID = 'TAKDVTLNQKNRLAMYHVJ4GFRSCWBDTTMY2UOEEBZ5KD4HT1G3' # DR
CLIENT_SECRET = 'AKXUKTLJD0KFMKCTVO11A3F5Y2EHN5BIE3HSDTAXPK1P1IL1' # DR
VERSION = '20180604'  # Foursquare API version date
LIMIT = 75            # max venues returned per search call
radius = 2000         # search radius around each county centre, in metres
# Extract the category name of a venue row returned by the Foursquare API.
def get_category_type(row):
    """Return the name of the first category attached to a venue *row*.

    The Foursquare payload stores the category list under ``'categories'``
    (search results) or ``'venue.categories'`` (flattened venue records);
    try the first and fall back to the second.

    Returns ``None`` when the category list is empty (or None).
    """
    try:
        categories_list = row['categories']
    except KeyError:  # narrowed from a bare except: only a missing key is expected
        categories_list = row['venue.categories']
    # Truthiness also guards against a None placeholder, not just [].
    if not categories_list:
        return None
    return categories_list[0]['name']
# Build a county-wise venue DataFrame from the Foursquare search API for the
# 20 highest-case counties.
venue_records = []
for ind1 in df_usa_ca_county_top20.index:
    # Skip counties that could not be geocoded (left at the 0/0 sentinel).
    if df_usa_ca_county_top20['lat'][ind1] != 0 and df_usa_ca_county_top20['lon'][ind1] != 0:
        latitude = df_usa_ca_county_top20['lat'][ind1]
        longitude = df_usa_ca_county_top20['lon'][ind1]
        ### Query Foursquare around the county centre.
        url = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'.format(CLIENT_ID, CLIENT_SECRET, latitude, longitude, VERSION, radius, LIMIT)
        results = requests.get(url).json()
        # pandas.io.json.json_normalize was removed - use the top-level
        # pd.json_normalize instead.
        df_result = pd.json_normalize(results['response']['venues'])
        # Keep the venue name, its categories, everything location-related, and the id.
        req_cols = (['name', 'categories']
                    + [col for col in df_result.columns if col.startswith('location.')]
                    + ['id'])
        df_result_req = df_result.loc[:, req_cols]
        # Collapse the raw category list to a single category name per venue.
        df_result_req['categories'] = df_result_req.apply(get_category_type, axis=1)
        # Keep only the last dotted component of each column name (location.lat -> lat).
        df_result_req.columns = [column.split('.')[-1] for column in df_result_req.columns]
        # Collect plain dicts; building the DataFrame once at the end replaces
        # the removed (and quadratic) per-row DataFrame.append pattern.
        for ind2 in df_result_req.index:
            venue_records.append({
                'state': df_usa_ca_county_top20['state'][ind1],
                'county': df_usa_ca_county_top20['county'][ind1],
                'venue': df_result_req['name'][ind2],
                'category': df_result_req['categories'][ind2],
                'lat': df_result_req['lat'][ind2],
                'lon': df_result_req['lng'][ind2],
            })

df_usa_ca_county_venue = pd.DataFrame(
    venue_records, columns=['state', 'county', 'venue', 'category', 'lat', 'lon'])

### Keep only medical-support venues. A single isin filter replaces the five
### chained DataFrame.append calls (removed in pandas 2.0); row order follows
### the source frame rather than being grouped by category, which nothing
### below depends on.
medical_categories = ['Pharmacy', 'Medical Center', 'Doctor\'s Office', 'Hospital', 'Medical Lab']
df_usa_ca_county_venue2 = df_usa_ca_county_venue[
    df_usa_ca_county_venue['category'].isin(medical_categories)]

### Show the categories & venues we have, sorted by county.
df_usa_ca_county_venue2.sort_values(['county'])

### Map the medical-support venues around Riverside county, centred on their mean position.
riverside = df_usa_ca_county_venue2.loc[df_usa_ca_county_venue2['county'] == 'Riverside']
Map = folium.Map(location=[riverside['lat'].mean(), riverside['lon'].mean()], zoom_start=15)
CustomMap (Map, riverside['lat'], riverside['lon'], riverside['venue'], riverside['category'])
Map